# Benchmark m1: difftime() on raw character dates vs. explicit as.Date()
# casts. f1 leaves the coercion of the character inputs to difftime()
# itself (the slow path); f2 casts up front and is ~3x faster below.
start_date <- "2017-01-01"
end_date <- "2019-12-31"
# Whole weeks between d1 and d2, passing the character dates straight in.
f1<-function(d2, d1){
n_weeks <- floor(as.numeric(difftime(d2, d1, units="weeks")))
}
# Same computation, but converting both inputs to Date before difftime().
f2<-function(d2, d1){
n_weeks <- floor(as.numeric(difftime(as.Date(d2)
, as.Date(d1), units = "weeks")))
}
m1<-microbenchmark(
Nocast = f1(end_date, start_date),
Cast = f2(end_date, start_date),
times = 1000
)
print(m1)
## Unit: microseconds
## expr min lq mean median uq max neval
## Nocast 324.846 334.0790 353.2882 337.0495 348.706 3455.632 1000
## Cast 116.097 119.7185 129.7460 121.1770 123.936 3095.600 1000
fbox_plot(m1, "microseconds")
# Benchmark m3: growing a vector one element at a time vs. preallocating.
# Deliberately grows x with c() on every iteration (quadratic copying);
# this anti-pattern is the thing being measured -- do not "fix" it.
no_size <- function (n){
x <- c()
for (i in seq(n)) {
x <- c(x, i)
}
}
# Preallocates the full-length integer vector once and fills it in place.
explicit_size <- function (n){
x <- vector("integer", n)
for (i in seq(n)) {
x[i] <- i
}
}
m3 <- microbenchmark(
no_size = no_size(1e4),
explicit_size = explicit_size(1e4),
times = 10
)
print(m3)
## Unit: microseconds
## expr min lq mean median uq max
## no_size 68590.98 71457.882 72691.4349 72164.6660 73376.905 81942.750
## explicit_size 353.86 356.136 672.4449 357.6775 370.362 3465.972
## neval
## 10
## 10
fbox_plot(m3, "microseconds")
# Benchmark m4: counting elements above a threshold with
# length(which(cond)) vs. sum(cond).
# Renamed the test data from `vector` to `big_vec`: the old name masked
# base::vector(), which this very script calls in explicit_size().
# Note: 1e8 doubles is ~800 MB of memory.
big_vec <- runif(1e8)
# Counts via which(): materialises the full integer index vector first.
w1 <- function(x){
d <- length(which(x > .5))
}
# Counts directly: sum() of a logical vector counts its TRUEs.
w2 <- function(x){
d <- sum(x > .5)
}
m4 <- microbenchmark(
which = w1(big_vec),
nowhich = w2(big_vec),
times = 10
)
print(m4)
## Unit: milliseconds
## expr min lq mean median uq max neval
## which 635.0059 638.6475 670.2781 644.1300 653.8114 830.0850 10
## nowhich 223.3879 225.5231 240.4974 231.7384 239.6573 326.1463 10
fbox_plot(m4, "miliseconds")
# Benchmark m5: filter-rows-then-extract-column vs. masking the column
# directly with a logical index.
n <- 1e4
dt <- data.table(
a = seq(n), b = runif(n)
)
# Row-first: builds the whole filtered sub-table, then pulls column a.
v1 <- function(dt){
d <- mean(dt[dt$b > .5, ]$a)
}
# Column-first: subsets column a directly; no intermediate table.
v2 <- function(dt){
d <- mean(dt$a[dt$b > .5])
}
m5 <- microbenchmark(
row_operation = v1(dt),
column_operation = v2(dt),
times = 10
)
print(m5)
## Unit: microseconds
## expr min lq mean median uq max neval
## row_operation 190.566 199.413 1136.0205 214.1250 321.651 5874.159 10
## column_operation 74.338 77.264 362.1413 89.3025 126.296 2687.027 10
fbox_plot(m5, "microseconds")
# seq() avoids the 1:x pitfall: when x is 0, 1:x yields c(1, 0) rather than an empty sequence
num <- 1e7
# Colon operator: fast, but 1:num is unsafe when num can be 0.
s1 <- function(num){
d <- mean(1:num)
}
# seq(): equivalent result here, and safe for the num == 0 edge case.
s2 <- function(num){
d <- mean(seq(num))
}
# The timings below show the two forms are essentially indistinguishable,
# so the safer seq() costs nothing at this scale.
m6<-microbenchmark(
noseq = s1(num),
seq = s2(num),
times = 30
)
print(m6)
## Unit: milliseconds
## expr min lq mean median uq max neval
## noseq 69.82843 69.92892 70.83228 69.98857 70.51530 74.16076 30
## seq 69.87508 69.97086 71.01114 70.04180 72.05789 74.46660 30
fbox_plot(m6, "miliseconds")
# Benchmark m7: building a combined key column with paste0() vs. glue()
# inside mutate(), on a 1e6-row table.
large_dataset <- data.table(
id = 1:1000000,
value = sample(letters, 1000000, replace = TRUE)
)
# paste0(): plain vectorised character concatenation.
a1 <- function(x){
d <- x |> mutate(code = paste0(id, "_", value))
}
# glue(): interpolation syntax. NOTE(review): glue() returns a
# glue-classed vector, not plain character -- confirm downstream code
# does not care about the class.
a2 <- function(x){
d <- x |> mutate(code = glue("{id}_{value}"))
}
m7 <- microbenchmark(
with_paste = a1(large_dataset),
with_glue = a2(large_dataset),
times = 20
)
print(m7)
## Unit: milliseconds
## expr min lq mean median uq max neval
## with_paste 586.4241 598.9025 634.3884 607.8835 615.2178 1158.0133 20
## with_glue 614.6962 620.3687 627.2505 623.5681 631.6961 653.8361 20
fbox_plot(m7, "miliseconds")
# Benchmark m8: lapply() vs. an index for-loop over 1e5 list elements.
# Create a large list
big_list <- replicate(1e5, rnorm(10), simplify = FALSE)
m8 <- microbenchmark(
lapply = lapply(big_list, mean),
for_loop = {
# Grows `result` element by element on purpose -- this is the
# hand-rolled pattern being compared against lapply().
result <- list()
for (i in seq_along(big_list)) {
result[[i]] <- mean(big_list[[i]])
}
},
times = 10
)
print(m8)
## Unit: milliseconds
## expr min lq mean median uq max neval
## lapply 308.5040 326.8918 356.7680 333.3494 379.6477 489.7449 10
## for_loop 334.2977 344.8711 379.9272 362.1968 383.8401 485.6251 10
fbox_plot(m8, "miliseconds")
# Benchmark m9: filtering a date range -- dplyr::filter on Date columns
# vs. data.table::between on IDate (integer-backed date) columns.
dt <- data.table(
Date = as.Date('2023-01-01') + 0:99999,
iDate = as.IDate('2023-01-01') + 0:99999,
Value = rnorm(100000)
)
nd1 <- as.Date('2023-01-01')
nd2 <- as.Date('2023-01-10')
id1 <- as.IDate('2023-01-01')
id2 <- as.IDate('2023-01-10')
# dplyr filter() over the Date bounds (reads `dt` from the enclosing scope).
date_between_operation <- function(nd1, nd2) {
dt |> filter(Date >= nd1 & Date <= nd2)
}
# data.table between() over the IDate bounds (also reads `dt` from scope).
idate_between_operation <- function(id1, id2) {
dt |> _[data.table::between(iDate, id1, id2)]
}
m9 <- microbenchmark(
Date = date_between_operation(nd1, nd2),
iDate = idate_between_operation(id1, id2),
times = 200L
)
print(m9)
## Unit: microseconds
## expr min lq mean median uq max neval
## Date 1464.836 1736.2070 1972.7392 1791.870 2157.2935 3758.328 200
## iDate 568.812 596.6795 686.3415 615.369 725.0735 2158.130 200
# Fixed: the results above are in microseconds; the plot label previously
# said "miliseconds", contradicting the printed unit.
fbox_plot(m9, "microseconds")
# Benchmark m10: base switch() vs. dplyr::case_when() when both are
# applied element-by-element via sapply().
switch_function <- function(x) {
switch(x,
"a" = "apple",
"b" = "banana",
"c" = "cherry",
"default")
}
# case_when() is built for whole vectors; calling it once per element
# inside sapply() pays its setup cost 1000 times over, which is why it
# loses by ~300x below.
case_when_function <- function(x) {
case_when(
x == "a" ~ "apple",
x == "b" ~ "banana",
x == "c" ~ "cherry",
TRUE ~ "default"
)
}
# Create a vector of test values
test_values <- sample(c("a", "b", "c", "d"), 1000, replace = TRUE)
m10 <- microbenchmark(
switch = sapply(test_values, switch_function),
case_when = sapply(test_values, case_when_function),
times = 200L
)
print(m10)
## Unit: microseconds
## expr min lq mean median uq max
## switch 636.518 645.806 668.048 651.7675 660.253 2045.089
## case_when 215659.545 227684.179 231731.279 231078.8330 234194.967 371896.744
## neval
## 200
## 200
fbox_plot(m10, "microseconds")
# Benchmark m11: dplyr::case_when vs. data.table::fcase for bucketing a
# numeric column into Low/Medium/High inside mutate().
set.seed(123)
n <- 1e6
data <- data.table(
id = seq(n),
value = sample(seq(100), n, replace = TRUE)
)
# case_when: conditions evaluated in order, first match wins.
casewhenf <- function(data){
df <- data |>
mutate(category = case_when(
value <= 20 ~ "Low",
value <= 70 ~ "Medium",
value > 70 ~ "High"))
}
# fcase: data.table's equivalent with the same condition/value pairs.
fcasef <- function(data){
df <- data |>
mutate(category = fcase(
value <= 20, "Low",
value <= 70, "Medium",
value > 70, "High"))
}
m11 <- microbenchmark(
case_when = casewhenf(data),
fcase = fcasef(data),
times = 20
)
print(m11)
## Unit: milliseconds
## expr min lq mean median uq max neval
## case_when 56.63917 57.74803 62.79273 60.70106 63.26526 85.27853 20
## fcase 20.56803 20.78007 22.06972 20.86613 22.44801 27.40792 20
fbox_plot(m11, "miliseconds")
# Benchmark m12: filling NAs with tidyr::replace_na vs. data.table::fcoalesce.
set.seed(123)
DT <- data.table(
ID = 1:1e6,
Value1 = sample(c(NA, 1:100), 1e6, replace = TRUE),
Value2 = sample(c(NA, 101:200), 1e6, replace = TRUE)
)
# Define the functions
# NOTE(review): this arm pays an extra as.data.table() step that the
# fcoalesce arm does not, so the comparison is slightly asymmetric.
replace_na_f <- function(data){
DF <- data |>
mutate(Value1 = replace_na(Value1, 0),
Value2 = replace_na(Value2, 0)) |>
as.data.table()
}
# fcoalesce with an integer 0L fill to match the integer columns.
fcoalesce_f <- function(data){
DF <- data |>
mutate(Value1 = fcoalesce(Value1, 0L),
Value2 = fcoalesce(Value2, 0L))
}
m12 <- microbenchmark(
treplace_na = replace_na_f(DT),
tfcoalesce = fcoalesce_f(DT),
times = 20
)
print(m12)
## Unit: milliseconds
## expr min lq mean median uq max neval
## treplace_na 7.696541 7.917915 8.392480 8.240813 8.383318 10.932434 20
## tfcoalesce 1.587544 1.801179 2.517975 2.116061 3.430525 4.121957 20
fbox_plot(m12, "miliseconds")
# Benchmark m13: negated grepl() filter in data.table syntax vs. dplyr.
# Only 4 rows, so this mostly measures per-call overhead, not scaling.
dt <- data.table(field_name = c("argentina.blue.man.watch",
"brazil.red.woman.shoes",
"canada.green.kid.hat",
"denmark.red.man.shirt"))
# Filter rows where 'field_name' does not contain 'red'
dtnot <- function(data){
filtered_dt <- data |> _[!grepl("red", field_name)]
}
dplyrnot <- function(data){
filtered_dt <- data |> filter(!grepl("red", field_name))
}
m13 <- microbenchmark(
data_table_not = dtnot(dt),
dplyrnot = dplyrnot(dt),
times = 100
)
print(m13)
## Unit: microseconds
## expr min lq mean median uq max neval
## data_table_not 98.965 111.6635 159.9027 134.4665 160.565 2078.652 100
## dplyrnot 649.944 670.0660 778.6597 688.2800 737.452 2724.448 100
fbox_plot(m13, "microseconds")
# Benchmark m14: wide-to-long reshape with tidyr::pivot_longer vs.
# data.table::melt on a 100k-row, 4-variable table.
large_data <- data.table(
id = 1:100000,
var1 = rnorm(100000),
var2 = rnorm(100000),
var3 = rnorm(100000),
var4 = rnorm(100000)
)
# Benchmarking
m14 <- microbenchmark(
tidyr_pivot_longer = {
long_data_tidyr <- pivot_longer(large_data, cols = starts_with("var"),
names_to = "variable", values_to = "value")
},
data_table_melt = {
long_data_dt <- melt(large_data, id.vars = "id", variable.name = "variable",
value.name = "value")
},
times = 10
)
print(m14)
## Unit: microseconds
## expr min lq mean median uq max
## tidyr_pivot_longer 6311.746 6373.201 8997.8637 6467.141 6840.143 30905.990
## data_table_melt 444.791 484.875 581.3066 499.859 703.734 855.367
## neval
## 10
## 10
fbox_plot(m14, "microseconds")
# Benchmark m15: cross join of two 1000-element vectors --
# tidyr::expand_grid vs. data.table::CJ.
# NOTE(review): the two return different classes (tibble vs. keyed
# data.table) and may order rows differently -- confirm downstream use
# only cares about the row set.
vec1 <- seq(1000)
vec2 <- seq(1000)
# Define functions to be benchmarked
expand_grid_func <- function() {
return(expand_grid(vec1, vec2))
}
CJ_func <- function() {
return(CJ(vec1, vec2))
}
# Perform benchmarking
m15 <- microbenchmark(
expand_grid = expand_grid_func(),
CJ = CJ_func(),
times = 10
)
print(m15)
## Unit: microseconds
## expr min lq mean median uq max neval
## expand_grid 2271.882 2356.501 2764.077 2474.796 2992.037 4132.696 10
## CJ 458.717 482.571 776.949 598.964 919.878 1913.733 10
fbox_plot(m15, "microseconds")
# Benchmark m16: row-binding a list of 50 data.tables three ways:
# data.table::rbindlist, base do.call(rbind, ...), dplyr::bind_rows.
# (Cleanup: T/F abbreviations replaced with TRUE/FALSE, and `=` replaced
# with `<-` for top-level assignment.)
# Sample data
size <- 1e4
set.seed(44)
df_list <- replicate(50, data.table(id = sample(seq(size), size, replace = TRUE),
value = rnorm(size)), simplify = FALSE)
# Base R: one rbind over the whole list via do.call.
simple_bind <- function(list_of_dfs){
do.call(rbind, list_of_dfs)
}
# dplyr's bind_rows().
dplyr_bind <- function(list_of_dfs){
bind_rows(list_of_dfs)
}
# data.table's rbindlist(); fill = FALSE because all columns match.
dt_bind <- function(list_of_dfs){
rbindlist(list_of_dfs, fill = FALSE)
}
# Benchmark both methods
m16 <- microbenchmark(
dt_ver = dt_bind(df_list),
simple = simple_bind(df_list),
dplyr_ver = dplyr_bind(df_list),
times = 30
)
print(m16)
## Unit: microseconds
## expr min lq mean median uq max neval
## dt_ver 418.913 510.543 666.5973 554.6205 651.026 2054.636 30
## simple 485.848 521.012 676.3720 577.7635 628.263 2173.098 30
## dplyr_ver 10111.641 10247.625 10708.0204 10358.0360 11069.479 12881.703 30
fbox_plot(m16, "microseconds")
# Benchmark m17: extracting pieces 3 and 5 of a dot-delimited string
# column -- tidyr::separate vs. stringr::word vs. data.table::tstrsplit.
# (Cleanup: T/F abbreviations replaced with TRUE/FALSE.)
set.seed(123)
n <- 1e4
df <- data.table(text = paste("word1", "word2", "word3", "word4", "word5", sep = "."), stringsAsFactors = FALSE)
# Replicate the single row n times.
df <- df[rep(1, n), , drop = FALSE]
# Using tidyr::separate -- splits into all five pieces, then drops the
# unwanted ones.
separate_words <- function() {
df |>
separate(text, into = c("w1", "w2", "w3", "w4", "w5"), sep = "\\.", remove = FALSE) |>
select(-c(w1, w2, w4))
}
# Using stringr::word -- extracts only the two wanted pieces, but scans
# the string once per extracted word.
stringr_words <- function() {
df |>
mutate(
w3 = word(text, 3, sep = fixed(".")),
w5 = word(text, 5, sep = fixed("."))
)
}
# Using data.table::tstrsplit -- one split, keep pieces 3 and 5.
# NOTE(review): `:=` updates df by reference, so repeated benchmark
# iterations mutate df (adding w3/w5 columns); harmless here but worth
# knowing.
datatable_words <- function() {
df |> _[, c("w3", "w5") := tstrsplit(text, "\\.")[c(3, 5)]]
}
m17 <- microbenchmark(
separate = separate_words(),
stringr = stringr_words(),
dt = datatable_words(),
times = 10
)
print(m17)
## Unit: milliseconds
## expr min lq mean median uq max neval
## separate 77.47755 80.21303 95.79084 90.54606 93.56122 179.10341 10
## stringr 184.64272 186.57560 192.32802 188.28115 201.82303 205.98428 10
## dt 12.51353 12.56643 13.04390 12.63552 13.03781 15.25742 10
fbox_plot(m17, "miliseconds")
# Benchmark m18: dropping rows containing NAs -- tidyr::drop_na vs.
# stats::na.omit -- on the same 1e6-row table.
# Sample data
set.seed(123)
n <- 1e6
df <- data.table(
x = rnorm(n),
y = sample(c(NA, 1:100), n, replace = TRUE),
z = sample(c(NA, letters), n, replace = TRUE),
stringsAsFactors = FALSE
)
# Benchmark both methods
m18 <- microbenchmark(
dplyr_drop_na = {
df |> drop_na()
},
data_table_na_omit = {
# Fixed: this arm previously ran na.omit() on `dt` -- a tiny 4-row
# table left over from an earlier benchmark -- instead of `df`, so
# the two arms were not comparable (hence the implausible ~8 us
# timings recorded below). Re-run to refresh the numbers.
df |> na.omit()
},
times = 10
)
print(m18)
## Unit: microseconds
## expr min lq mean median uq max
## dplyr_drop_na 9368.785 9443.985 9511.2784 9455.4560 9506.521 9960.990
## data_table_na_omit 8.576 9.478 51.2127 55.0025 62.867 177.271
## neval
## 10
## 10
fbox_plot(m18, "microseconds")
# Benchmark m19: merging 100 tables against a shared lookup table,
# sequentially with lapply vs. in parallel with parallel::mclapply.
# (Cleanup: T/F abbreviations replaced with TRUE/FALSE, `=` replaced
# with `<-` for top-level assignment.)
# Sample data
set.seed(123)
size <- 1e4
n_cores <- parallelly::availableCores()
df_list <- replicate(100, data.table(id = sample(seq(size), size, replace = TRUE),
value = rnorm(size)), simplify = FALSE)
extra_df <- data.table(id = sample(seq(size), size, replace = TRUE),
extra_value = runif(size))
# Sequential join
sequential_join <- function() {
lapply(df_list, function(df) {
merge(df, extra_df, by = "id", allow.cartesian = TRUE)
})
}
# Parallel join using mclapply
# NOTE(review): mclapply forks, so it degrades to sequential on Windows;
# results below presumably come from a Unix-alike host.
parallel_join <- function() {
mclapply(df_list, function(df) {
merge(df, extra_df, by = "id", allow.cartesian = TRUE)
}, mc.cores = n_cores, mc.silent = TRUE, mc.cleanup = TRUE)
}
# Benchmark both methods
m19 <- microbenchmark(
sequential = sequential_join(),
parallel = parallel_join(),
times = 10
)
print(m19)
## Unit: milliseconds
## expr min lq mean median uq max neval
## sequential 299.3829 309.7949 329.8663 317.0577 348.9830 386.8511 10
## parallel 151.8398 158.9782 169.3643 164.5133 175.4441 193.8522 10
fbox_plot(m19, "miliseconds")
# Benchmark arm 1 (m21): read the in-progress "golden data" from a
# parquet dataset via arrow::open_dataset, derive the aggregate
# memory/session columns in dplyr, then collect() into a data.table.
# The select/mutate run lazily against the dataset until collect().
with_parquet <- function(){
fp_data <- "/conf/posit_azure_logs/data"
data_1 <- open_dataset(file.path(glue::glue("{fp_data}/golden_data_in_progress"))) |>
select(
date, hours, time,
ALL_WIP_CP_day_session, ALL_WIP_CP_night_session,
ALL_WIP_BP_day_session, ALL_WIP_BP_night_session,
ALL_WIP_CP_DS_mem_limit, ALL_WIP_CP_NS_mem_limit,
ALL_WIP_BP_DS_mem_limit, ALL_WIP_BP_NS_mem_limit,
ALL_WIP_CP_DS_mem_request, ALL_WIP_CP_NS_mem_request,
ALL_WIP_BP_DS_mem_request, ALL_WIP_BP_NS_mem_request,
ALL_WIP_CP_DS_mem_max, ALL_WIP_CP_NS_mem_max,
ALL_WIP_BP_DS_mem_max, ALL_WIP_BP_NS_mem_max,
ALL_WIP_CP_node_total, ALL_WIP_BP_node_total
) |>
mutate(
# Per-node memory scaled by node count; the 160/256 multipliers and
# the *1024 factor are assumed to convert GB-per-node to MB -- TODO
# confirm units against the data dictionary.
computepool_node_mem = ALL_WIP_CP_node_total * (160 * 1024),
bigpool_node_mem = ALL_WIP_BP_node_total * (256 * 1024),
# Combined compute-pool + big-pool session and node counts.
ALL_WIP_day_session = ALL_WIP_CP_day_session + ALL_WIP_BP_day_session,
ALL_WIP_night_session = ALL_WIP_CP_night_session + ALL_WIP_BP_night_session,
ALL_WIP_node_total = ALL_WIP_CP_node_total + ALL_WIP_BP_node_total,
# Totals across both pools and both day/night sessions.
total_mem_limit = ALL_WIP_CP_DS_mem_limit + ALL_WIP_CP_NS_mem_limit + ALL_WIP_BP_DS_mem_limit + ALL_WIP_BP_NS_mem_limit,
total_mem_request = ALL_WIP_CP_DS_mem_request + ALL_WIP_CP_NS_mem_request + ALL_WIP_BP_DS_mem_request + ALL_WIP_BP_NS_mem_request,
total_mem_max = ALL_WIP_CP_DS_mem_max + ALL_WIP_CP_NS_mem_max + ALL_WIP_BP_DS_mem_max + ALL_WIP_BP_NS_mem_max,
total_node_mem = computepool_node_mem + bigpool_node_mem,
# Guard against division by zero when there are no nodes.
average_session_per_node = ifelse(ALL_WIP_node_total != 0,
(ALL_WIP_day_session + ALL_WIP_night_session) / ALL_WIP_node_total, 0)
) |>
collect() |>
as.data.table()
}
# Benchmark arm 2 (m21): copy the DuckDB database file locally, run the
# whole derivation as a single SQL query, and return the result as a
# data.table.
with_duckfile <- function(){
# Work on a local copy so the shared database file is never touched.
# NOTE(review): file.copy() returns FALSE (without erroring) when the
# target already exists -- confirm a stale local copy cannot linger.
file.copy("/conf/posit_azure_logs/gatzos01/gd_inprogress.duckdb", "gd_inprogress.duckdb")
# Keep a handle on the connection so it can be closed: the original
# opened it inline inside dbGetQuery() and leaked it. on.exit() also
# guarantees the local copy is removed even if the query errors.
conn <- dbConnect(duckdb::duckdb(), dbdir = "./gd_inprogress.duckdb")
on.exit({
dbDisconnect(conn, shutdown = TRUE)
file.remove("./gd_inprogress.duckdb")
}, add = TRUE)
data_2 <- dbGetQuery(
conn = conn,
statement = glue("select date, hours, time,
ALL_WIP_CP_day_session, ALL_WIP_CP_night_session,
ALL_WIP_BP_day_session, ALL_WIP_BP_night_session,
ALL_WIP_CP_DS_mem_limit, ALL_WIP_CP_NS_mem_limit,
ALL_WIP_BP_DS_mem_limit, ALL_WIP_BP_NS_mem_limit,
ALL_WIP_CP_DS_mem_request, ALL_WIP_CP_NS_mem_request,
ALL_WIP_BP_DS_mem_request, ALL_WIP_BP_NS_mem_request,
ALL_WIP_CP_DS_mem_max, ALL_WIP_CP_NS_mem_max,
ALL_WIP_BP_DS_mem_max, ALL_WIP_BP_NS_mem_max,
ALL_WIP_CP_node_total, ALL_WIP_BP_node_total,
ALL_WIP_CP_node_total * 160 * 1024 as computepool_node_mem,
ALL_WIP_BP_node_total * 256 * 1024 as bigpool_node_mem,
ALL_WIP_CP_day_session + ALL_WIP_BP_day_session as ALL_WIP_day_session,
ALL_WIP_CP_night_session + ALL_WIP_BP_night_session as ALL_WIP_night_session,
ALL_WIP_CP_node_total + ALL_WIP_BP_node_total as ALL_WIP_node_total,
ALL_WIP_CP_DS_mem_limit + ALL_WIP_CP_NS_mem_limit + ALL_WIP_BP_DS_mem_limit + ALL_WIP_BP_NS_mem_limit as total_mem_limit,
ALL_WIP_CP_DS_mem_request + ALL_WIP_CP_NS_mem_request + ALL_WIP_BP_DS_mem_request + ALL_WIP_BP_NS_mem_request as total_mem_request,
ALL_WIP_CP_DS_mem_max + ALL_WIP_CP_NS_mem_max + ALL_WIP_BP_DS_mem_max + ALL_WIP_BP_NS_mem_max as total_mem_max,
computepool_node_mem + bigpool_node_mem as total_node_mem,
CASE
WHEN ALL_WIP_node_total != 0 THEN (ALL_WIP_day_session + ALL_WIP_night_session) / ALL_WIP_node_total
ELSE 0
END AS average_session_per_node
from gdinprog"),
immediate = TRUE) |>
as.data.table()
# Return the query result. (The original's last expression was
# file.remove(), so it returned a logical instead of the data --
# inconsistent with with_parquet, which returns its data.table.)
data_2
}
# Benchmark m21: parquet + dplyr pipeline vs. a single DuckDB SQL query.
# Only 3 iterations: both arms hit the filesystem and take seconds.
m21 <- microbenchmark(
with_parquet = with_parquet(),
with_duckfile = with_duckfile(),
times = 3
)
print(m21)
## Unit: milliseconds
## expr min lq mean median uq max
## with_parquet 12636.6832 12857.6282 16831.3121 13078.573 18928.6265 24778.6797
## with_duckfile 464.8961 541.4405 607.0797 617.985 678.1714 738.3579
## neval
## 3
## 3
fbox_plot(m21, "miliseconds")